# -*- coding:utf-8 -*-

from lxml import etree
import re

# Sample fragment for exploring the XPath `following` and `following-sibling`
# axes. Adjacent string literals are concatenated, so the markup contains no
# whitespace between tags.
data = (
    '<div class="aa">'
    '<a>不要</a>'
    '<span class="vv">'
    '<span class="aa">文本a</span>'
    '<br>这是文本1'
    '<br>这是文本2'
    '<a class="bb">文本b</a>'
    '<a >文本c</a>'
    '<a >文本d</a>'
    '</span>'
    '<br>表情文本'
    '<a id="M_">文本要</a>'
    '<span class="dd"><a>不要</a></span>'
    '</div>'
)
data = etree.HTML(data)

# Text nodes on the `following` axis of <span class="aa">: all of them, then
# only the first, then only the third.
for query in (
    '//span[@class="aa"]/following::text()',
    '//span[@class="aa"]/following::text()[1]',
    '//span[@class="aa"]/following::text()[3]',
):
    result = data.xpath(query)
    print(result)

print("*" * 15)

for query in (
    # Match every node after this node (including children of sibling nodes).
    '//div[@class="aa"]/a[1]/following::text()',
    # Only text nodes that are direct following siblings of the first <a>.
    '//div[@class="aa"]/a[1]/following-sibling::text()',
    # Match all following-sibling element nodes, then every text node below them.
    '//div[@class="aa"]/a[1]/following-sibling::*//text()',
    # Text children of the nodes following <span class="vv">, capped at
    # (number of siblings preceding <span class="dd">) - 1 nodes.
    '//div[@class="aa"]/span[@class="vv"]/following::node()[position() <= count(//span[@class="dd"]/preceding-sibling::node()) - 1]/text()',
):
    result = data.xpath(query)
    print(result)

print("*" * 15)

# Sample with five <a> tags interleaved with <p> tags, used to compare
# positional predicates on the sibling axes.
data = (
    '<div>'
    '<a id="1" href="www.baidu.com">我是第1个a标签</a>'
    '<p>我是p标签</p>'
    '<a id="2" href="www.baidu.com">我是第2个a标签</a>'
    '<a id="3" href="www.baidu.com">我是第3个a标签</a>'
    '<a id="4" href="www.baidu.com">我是第4个a标签</a>'
    '<p>我是p标签</p>'
    '<a id="5" href="www.baidu.com">我是第5个a标签</a>'
    '</div>'
)
data = etree.HTML(data)

# Second sibling after / before <a id="3">, counting either only <a> elements
# (`a[2]`) or elements of any tag (`*[2]`).
for query in (
    "//a[@id='3']/following-sibling::a[2]/text()",
    "//a[@id='3']/following-sibling::*[2]/text()",
    "//a[@id='3']/preceding-sibling::a[2]/text()",
    "//a[@id='3']/preceding-sibling::*[2]/text()",
):
    result = data.xpath(query)
    print(result)

# Step up to the parent of <a id="3"> and take that parent's first <a> child.
result = data.xpath("//a[@id='3']/../a[1]/text()")
label = "获取第三个a标签的父标签（的第一个a标签）："
print(label + result[0])

print("*" * 50)

# Two samples for selecting the nodes that sit between two marker <div>s.
data1 = (
    '<div class="a">'
    '不需要的数据1'
    '<div class="b">text1</div>'
    '需要的数据1'
    '<a href="baidu.com">抽奖详情</a>'
    '<div class="d">text2</div>'
    '</div>'
)

data2 = (
    '<div class="a">'
    '不需要的数据1'
    '<div class="b">text1</div>'
    '<br>'
    '需要的数据1'
    '<br>'
    '需要的数据2'
    '<a href="baidu.com">抽奖详情</a>'
    '<div class="d">text2</div>'
    '<div class="e">text3</div>'
    '<div class="f">text4</div>'
    '</div>'
)

# Following siblings of the "text1" div, limited by a count() expression.
# NOTE(review): the path inside count() starts with `div[1]`, i.e. it is
# relative to each candidate sibling rather than to the document root, so it
# presumably matches nothing and the bound collapses to `position() <= 1`.
# Confirm whether an absolute path was intended.
data = etree.HTML(data1)
result = data.xpath(
    '//div[@class="a"]//div[contains(text(),"text1")]/following-sibling::node()[position() <= count( div[1]//div[contains(text(),"text2")]/following-sibling::node()) + 1]'
)
print(result)

# Following siblings of <div class="b">, capped at the number of siblings that
# precede <div class="d"> minus 0, 1, 2 and 2 again; the trailing `/text()`
# step then selects the text children of the chosen nodes. The sample is
# re-parsed before every query.
for query in (
    '//div[@class="a"]//div[@class="b"]/following-sibling::node()[position() <= count(//div[@class="a"]//div[@class="d"]/preceding-sibling::node())]/text()',
    '//div[@class="a"]//div[@class="b"]/following-sibling::node()[position() <= count(//div[@class="a"]//div[@class="d"]/preceding-sibling::node()) - 1]/text()',
    '//div[@class="a"]//div[@class="b"]/following-sibling::node()[position() <= count(//div[@class="a"]//div[@class="d"]/preceding-sibling::node()) - 2]/text()',
    '//div[@class="a"]//div[@class="b"]/following-sibling::node()[position() <= count(//div[@class="a"]//div[@class="d"]/preceding-sibling::node()) - 2]/text()',
):
    data = etree.HTML(data2)
    result = data.xpath(query)
    print(result)

print("*" * 100)

# final_data = '<div class="c" id="M_"><div>    <a href="/loris0527">桃子carol</a><img src="https://h5.sinaimg.cn/upload/2016/05/26/319/5338.gif" alt="V"><img src="https://h5.sinaimg.cn/upload/2016/05/26/319/donate_btn_s.png" alt="M">    <span class="ctt">:超级下饭的双十一购物分享！<img alt="[憧憬]" src="//h5.sinaimg.cn/m/emoticon/icon/default/d_xingxingyan-06a3ca0ae4.png" style="width:1em; height:1em;"></span><br>来康康我双十一都买了啥？是值得买的囤货？或者是买到踩雷单品？👌🏽<br><a href="https://weibo.cn/pages/100808topic?extparam=%E5%8F%8C%E5%8D%81%E4%B8%80%E5%BC%80%E7%AE%B1&amp;from=feed">#双十一开箱#</a> 抗初老精华/比dyson还贵的丑萌加湿器/戒糖也能喝的气泡水<br>哈哈哈哈哈虽然来的晚了点 好在是双十二之前发出来了！😌<br>去翻了翻现在的价格 双十一真的划算很多 有些开始预告双十二价格力度都没那么大了～<br><br><a href="https://weibo.cn/pages/100808topic?extparam=%E7%BE%8E%E6%A1%83%E7%A7%8D%E8%8D%89%E6%9C%BA&amp;from=feed">#美桃种草机#</a> <a href="https://weibo.cn/pages/100808topic?extparam=%E6%8A%A4%E8%82%A4%E5%93%81%E7%A7%8D%E8%8D%89&amp;from=feed">#护肤品种草#</a> <a href="https://weibo.cn/pages/100808topic?extparam=%E5%90%83%E8%B4%A7%E7%A7%8D%E8%8D%89%E6%9C%BA&amp;from=feed">#吃货种草机#</a> <a href="https://weibo.cn/pages/100808topic?extparam=%E5%A5%BD%E7%89%A9%E6%8E%A8%E8%8D%90&amp;from=feed">#好物推荐#</a> <a href="https://weibo.cn/pages/100808topic?extparam=%E5%A5%BD%E7%89%A9%E5%88%86%E4%BA%AB&amp;from=feed">#好物分享#</a>  <a href="https://m.weibo.cn/s/video/show?object_id=1034:4445491912441865&amp;fromWap=1">桃子carol的微博视频</a> <a href="https://weibo.cn/sinaurl?f=w&amp;u=http%3A%2F%2Ft.cn%2FAikffZr3">抽奖详情</a>                    <!-- 是否进行翻译 -->        &nbsp;    <span class="ct">2019-12-03 19:50:26    </span>    &nbsp;<a href="/attention/add?uid=2385942587&amp;rl=2&amp;st=1eb20e">关注她</a>        &nbsp;<a href="/spam/?mid=Ij5IX3xqe&amp;fuid=2385942587&amp;type=1&amp;rl=2">举报</a>&nbsp;<a href="/fav/addFav/Ij5IX3xqe?rl=2&amp;st=1eb20e">收藏</a>&nbsp;<a href="/mblog/operation/Ij5IX3xqe?uid=2385942587&amp;rl=2">操作</a>    </div></div>'

# final_data = '<div class="c" id="M_"><div>    <a href="/monlikan">银河到月球</a><img src="https://h5.sinaimg.cn/upload/2016/05/26/319/5338.gif" alt="V"><img src="https://h5.sinaimg.cn/upload/2016/05/26/319/donate_btn_s.png" alt="M">    <span class="ctt">:化妆视频忘记更新微博啦！<br>美妆蛋还有超级平价的化妆刷我都有字幕哦<br><a href="https://weibo.cn/pages/100808topic?extparam=%E5%A5%BD%E7%89%A9%E5%88%86%E4%BA%AB&amp;from=feed">#好物分享#</a> <a href="https://m.weibo.cn/s/video/show?object_id=1034:4459622661029909&amp;fromWap=1">银河到月球的微博视频</a> </span>                    <!-- 是否进行翻译 -->        &nbsp;    <span class="ct">01月11日 19:03    </span>    &nbsp;<a href="/attention/add?uid=2741638307&amp;rl=2&amp;st=1eb20e">关注她</a>        &nbsp;<a href="/spam/?mid=Ip15afhZk&amp;fuid=2741638307&amp;type=1&amp;rl=2">举报</a>&nbsp;<a href="/fav/addFav/Ip15afhZk?rl=2&amp;st=1eb20e">收藏</a>&nbsp;<a href="/mblog/operation/Ip15afhZk?uid=2741638307&amp;rl=2">操作</a>    </div></div>'

# final_data = '<div class="c" id="M_"><div>    <a href="/u/6097637441">蔡徐坤后援法律站</a><img src="https://h5.sinaimg.cn/upload/2016/05/26/319/5338.gif" alt="V"><img src="https://h5.sinaimg.cn/upload/2016/05/26/319/donate_btn_s.png" alt="M">    <span class="ctt">:<a href="https://weibo.cn/pages/100808topic?extparam=%E8%94%A1%E5%BE%90%E5%9D%A4%5B%E8%B6%85%E8%AF%9D%5D&amp;from=feed">#蔡徐坤[超话]#</a>[爱你]<a href="https://weibo.cn/pages/100808topic?extparam=%E8%94%A1%E5%BE%90%E5%9D%A4c%E4%BD%8D%E5%87%BA%E9%81%93%E4%B8%A4%E5%91%A8%E5%B9%B4%E5%BF%AB%E4%B9%90&amp;from=feed">#蔡徐坤c位出道两周年快乐#</a> <br>🎉为庆祝蔡徐坤C位出道两周年，站站将抽取7个小猫咪包邮送出以下🎁（4.6加专5张及以上才能获得抽奖资格）：<br>1⃣费加罗2018年7月刊随机限量海报一张； <br>2⃣上海电视2018年4月封面杂志一本；<br>3⃣时尚Cosmo2019年7月刊封面杂志一本；<br>4⃣高丝suncut小金瓶一瓶；<br>5⃣袋鼠护发素一瓶；<br>6⃣芭莎艺术封面杂志一本；<br>7⃣时尚先生fine2019年11月刊封面杂志一本。<br><br>单枪匹马，四面伏敌，霜刃未试胸怀揽月之志。<br>宿夜蛰伏，一朝涅槃，刹那惊鸿得破苍穹而出。<br>渡妖重生，静守己心，狂风当歌不畏浮云沧海。<br>忆往昔运拙时艰，幸得觅知音见采。<br>九万里风鹏正举，六合间寰宇朗清。<br>愿此后征程坦荡，展乾坤志远心坚。<br>出道两周年，有幸陪伴，与有荣焉。<br><br>【参与抽奖条件】<br>①关注<a href="/n/%E8%94%A1%E5%BE%90%E5%9D%A4%E2%9E%95">@蔡徐坤➕</a><a href="/n/%E8%94%A1%E5%BE%90%E5%9D%A4%E5%90%8E%E6%8F%B4%E6%B3%95%E5%BE%8B%E7%AB%99">@蔡徐坤后援法律站</a><br>②转发本条微博<br>③4.6加专5张及以上<br><br>【开奖时间】2020年4月7日</span>                &nbsp;</div><div><a href="/mblog/pic/IC3EUh8QU?rl=2"><img src="http://wx4.sinaimg.cn/wap180/006EF42Zgy1gdjzhaq0rrj30u01hcgw6.jpg" alt="图片" class="ib"></a>&nbsp;<a href="/mblog/oripic?&amp;id=IC3EUh8QU&amp;u=006EF42Zgy1gdjzhaq0rrj30u01hcgw6&amp;rl=2">原图</a>                <!-- 是否进行翻译 -->        &nbsp;    <span class="ct">今天 13:30    </span>    &nbsp;<a href="/attention/add?uid=6097637441&amp;rl=2&amp;st=1eb20e">关注她</a>        &nbsp;<a href="/spam/?mid=IC3EUh8QU&amp;fuid=6097637441&amp;type=1&amp;rl=2">举报</a>&nbsp;<a href="/fav/addFav/IC3EUh8QU?rl=2&amp;st=1eb20e">收藏</a>&nbsp;<a href="/mblog/operation/IC3EUh8QU?uid=6097637441&amp;rl=2">操作</a>    </div></div>'

# final_data = '<div class="c" id="M_"><div>    <a href="/pikasam">PiKa_Sam</a><img src="https://h5.sinaimg.cn/upload/2016/05/26/319/5547.gif" alt="达人">    <span class="ctt">:原图→PS后<img alt="[允悲]" src="//h5.sinaimg.cn/m/emoticon/icon/default/d_yunbei-9aa3c436a4.png" style="width:1em; height:1em;"></span>搞了一个多钟，还有个视频不想弄了，心累<img alt="[黑线]" src="//h5.sinaimg.cn/m/emoticon/icon/default/d_heixian-bde08b426c.png" style="width:1em; height:1em;"> <a href="https://weibo.cn/sinaurl?f=w&amp;u=http%3A%2F%2Ft.cn%2Fz8Aa3ax&amp;ep=IC5o8dvH9%2C2001232624%2CIC5o8dvH9%2C2001232624">上海·顾村公园</a>             &nbsp;[<a href="/mblog/picAll/IC5o8dvH9?rl=2">组图共2张</a>]    &nbsp;</div><div><a href="/mblog/pic/IC5o8dvH9?rl=2"><img src="http://wx1.sinaimg.cn/wap180/774862f0gy1gdk74m85y9j23402c0e86.jpg" alt="图片" class="ib"></a>&nbsp;<a href="/mblog/oripic?&amp;id=IC5o8dvH9&amp;u=774862f0gy1gdk74m85y9j23402c0e86&amp;rl=2">原图</a>                <!-- 是否进行翻译 -->        &nbsp;    <span class="ct">今天 17:54    </span>    &nbsp;<a href="/attention/add?uid=2001232624&amp;rl=2&amp;st=1eb20e">关注他</a>        &nbsp;<a href="/spam/?mid=IC5o8dvH9&amp;fuid=2001232624&amp;type=1&amp;rl=2">举报</a>&nbsp;<a href="/fav/addFav/IC5o8dvH9?rl=2&amp;st=1eb20e">收藏</a>&nbsp;<a href="/mblog/operation/IC5o8dvH9?uid=2001232624&amp;rl=2">操作</a>    </div></div>'

# Sample post markup: the outer <div id="M_"> wraps two inner <div>s; the
# queries below only look at the first one.
final_data = '<div class="c" id="M_"><div>    <a href="/u/2817413475">OMG好物</a><img src="https://h5.sinaimg.cn/upload/2016/05/26/319/5338.gif" alt="V"><img src="https://h5.sinaimg.cn/upload/2016/05/26/319/donate_btn_s.png" alt="M">    <span class="ctt">:3ce 丝绒水唇釉，薄涂厚涂都敲美的哦<img alt="[羞嗒嗒]" src="//h5.sinaimg.cn/m/emoticon/icon/lxh/lxh_xiudada-f44e8f5688.png" style="width:1em; height:1em;"></span><br><a href="https://weibo.cn/pages/100808topic?extparam=%E7%BE%8E%E5%A6%86%E7%94%9F%E6%B4%BB&amp;from=feed">#美妆生活#</a><a href="https://weibo.cn/pages/100808topic?extparam=%E5%8F%A3%E7%BA%A2%E7%A7%8D%E8%8D%89&amp;from=feed">#口红种草#</a>             &nbsp;[<a href="/mblog/picAll/Ind5asrHp?rl=2">组图共9张</a>]    &nbsp;</div><div><a href="/mblog/pic/Ind5asrHp?rl=2"><img src="http://wx2.sinaimg.cn/wap180/a7ee5163gy1gaey7b2hakj20j60pl0u4.jpg" alt="图片" class="ib"></a>&nbsp;<a href="/mblog/oripic?&amp;id=Ind5asrHp&amp;u=a7ee5163gy1gaey7b2hakj20j60pl0u4&amp;rl=2">原图</a>                <!-- 是否进行翻译 -->        &nbsp;    <span class="ct">2019-12-30 21:55:37    </span>    &nbsp;<a href="/attention/add?uid=2817413475&amp;rl=2&amp;st=1eb20e">关注她</a>        &nbsp;<a href="/spam/?mid=Ind5asrHp&amp;fuid=2817413475&amp;type=1&amp;rl=2">举报</a>&nbsp;<a href="/fav/addFav/Ind5asrHp?rl=2&amp;st=1eb20e">收藏</a>&nbsp;<a href="/mblog/operation/Ind5asrHp?uid=2817413475&amp;rl=2">操作</a>    </div></div>'


def _clean(texts):
    """Return the stripped form of every entry in *texts* that is not blank."""
    return [stripped for stripped in (text.strip() for text in texts) if stripped]


# Raw string: "\d" in a plain literal is an invalid escape sequence and
# triggers a warning on current Python versions.
_TIME_PATTERN = re.compile(r"\d+:\d+")

final_data = etree.HTML(final_data)

# Approach 1: collect three groups of text from the first inner <div> and
# concatenate them.
# content1: every text node inside <span class="ctt">.
content1 = final_data.xpath('//div[@id="M_"]/div[1]/span[@class="ctt"]//text()')
# content2: text nodes that are direct children of the first inner <div>.
content2 = final_data.xpath('//div[@id="M_"]/div[1]/text()')
# Alternative query for content3 (disabled):
# content3 = final_data.xpath('//div[@id="M_"]/div[1]/span[@class="ctt"]/following-sibling::node()[position() <= count(//div[@id="M_"]/div[1]//span[@class="ct"]/preceding-sibling::node()) - 6]/text()')
# content3: text of the <a> siblings that follow <span class="ctt">.
content3 = final_data.xpath('//div[@id="M_"]/div[1]/span[@class="ctt"]/following-sibling::a/text()')
print(_clean(content1))
print(_clean(content2))
print(_clean(content3))

content = ' '.join(_clean(content1 + content2 + content3))  # joined with single spaces
print(content)

# Approach 2: take every text node of the first inner <div> in document order
# and keep fragments until one contains "<digits>:<digits>" (presumably the
# post's timestamp - TODO confirm against real pages).
content4 = _clean(final_data.xpath('//div[@id="M_"]/div[1]//text()'))
content = []
for element in content4:
    if _TIME_PATTERN.search(element):
        break
    content.append(element)
content = ' '.join(content)
print(content4)
print(content)